Load data

### Original author's working directory (kept for provenance; commented out)
### setwd("/Users/seantrott/Dropbox/UCSD/Research/Ambiguity/SSD/spanish_norms/src/analysis/")
### Read in all data
### Critical-trial relatedness judgments: one row per participant x item pair
df_final = read_csv("../../data/processed/human/sawc_relatedness_full_critical_data.csv")
### Sanity checks: total observations and number of participants
nrow(df_final)
## [1] 10639
length(unique(df_final$Participant))
## [1] 131

RQ1: Same vs. Different Sense

### P(response | condition): for each Same_sense condition, the proportion of
### trials receiving each relatedness rating (count_response / condition total)
df_ratio = df_final %>%
  group_by(Same_sense) %>%
  mutate(count_condition = n()) %>%  # condition total, repeated on every row
  ungroup() %>%
  group_by(Same_sense, Response, count_condition) %>%
  summarise(count_response = n()) %>%
  mutate(prop_response = count_response / count_condition)
              
### Response-distribution bar plot (proportions), faceted by condition
df_ratio %>%
  ggplot(aes(x = Response,
             y = prop_response)) +
  geom_bar(alpha = .6, stat = "identity") +
  theme_minimal() +
  labs(x = "Relatedness",
       y = "P(Response | Condition)") +
  facet_wrap(~Same_sense) +
  theme(text = element_text(size = 15),
        legend.position="none")

### Same plot in raw counts, computed directly from the trial-level data
df_final %>%
  ggplot(aes(x = Response)) +
  geom_bar(alpha = .6, stat = "count") +
  theme_minimal() +
  labs(x = "Relatedness",
       y = "Count") +
  facet_wrap(~Same_sense) +
  theme(text = element_text(size = 15),
        legend.position="none")

### Descriptives: mean and SD of relatedness by condition
df_final %>%
  group_by(Same_sense) %>%
  summarise(m_rel = mean(Response),
            sd_rel = sd(Response))
## # A tibble: 2 × 3
##   Same_sense      m_rel sd_rel
##   <chr>           <dbl>  <dbl>
## 1 Different Sense  2.11   1.41
## 2 Same Sense       4.35   1.14
### Mixed models fit with ML (REML = FALSE) so the likelihood-ratio test
### below is valid. Full model: Same_sense fixed effect plus by-participant
### and by-list random slopes and by-word random intercepts.
mod_full = lmer(data = df_final,
                Response ~ Same_sense +
                  (1 + Same_sense | Participant) + 
                  (1 + Same_sense | List) + (1 | Word),
                REML = FALSE)

### Identical random-effects structure, Same_sense fixed effect removed
mod_reduced = lmer(data = df_final,
                Response ~ # Same_sense +
                  (1 + Same_sense | Participant) + 
                  (1 + Same_sense | List) + (1 | Word),
                REML = FALSE)

### Full-model estimates (Satterthwaite df via lmerTest)
summary(mod_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: Response ~ Same_sense + (1 + Same_sense | Participant) + (1 +  
##     Same_sense | List) + (1 | Word)
##    Data: df_final
## 
##      AIC      BIC   logLik deviance df.resid 
##  34198.6  34271.3 -17089.3  34178.6    10629 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8340 -0.6554 -0.1521  0.6411  3.2534 
## 
## Random effects:
##  Groups      Name                 Variance Std.Dev. Corr 
##  Participant (Intercept)          0.13240  0.3639        
##              Same_senseSame Sense 0.13065  0.3614   -0.59
##  Word        (Intercept)          0.32042  0.5661        
##  List        (Intercept)          0.02791  0.1671        
##              Same_senseSame Sense 0.07990  0.2827   -0.55
##  Residual                         1.35993  1.1662        
## Number of obs: 10639, groups:  Participant, 131; Word, 102; List, 10
## 
## Fixed effects:
##                      Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)           2.11454    0.08478 29.45271   24.94  < 2e-16 ***
## Same_senseSame Sense  2.24722    0.09838  9.67540   22.84 9.65e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Sm_snsSmSns -0.406
### Likelihood-ratio test for the Same_sense fixed effect
anova(mod_full, mod_reduced)
## Data: df_final
## Models:
## mod_reduced: Response ~ (1 + Same_sense | Participant) + (1 + Same_sense | List) + (1 | Word)
## mod_full: Response ~ Same_sense + (1 + Same_sense | Participant) + (1 + Same_sense | List) + (1 | Word)
##             npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## mod_reduced    9 34236 34302 -17109    34218                         
## mod_full      10 34199 34271 -17089    34179 39.589  1  3.135e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

RQ2 + RQ5: Correlation with BETO

Load and process BETO data

### BETO distances
### Cosine distances between contextualized embeddings: one row per
### sentence pair per layer
df_beto_distances = read_csv("../../data/processed/models/sawc-distances_model-bert-base-spanish-wwm-cased.csv")
nrow(df_beto_distances)
## [1] 10556
### basic items to get Tag
df_sawc_items = read_csv("../../data/raw/items/sawc_sentence_pairs.csv")

### merge together
### left_join on all shared columns (default keys); row count should not change
df_merged = df_beto_distances %>%
  left_join(df_sawc_items)
nrow(df_merged)
## [1] 10556

Merge with SAW-C Norms

### Item-level human norms: mean relatedness per (List, Word, Tag) cell
df_list_mean = df_final %>%
  group_by(List, Word, Tag) %>%
  summarise(mean_relatedness = mean(Response), .groups = "drop",
            count = n())
nrow(df_list_mean)
## [1] 812
### Attach the human item means to every layer's BETO distances
df_merged_beto = df_merged %>%
  inner_join(df_list_mean)
nrow(df_merged_beto)
## [1] 10556

RQ2: Correlation by layer

### Correlate human relatedness with BETO cosine distance, per layer.
### Distance should correlate negatively with relatedness.
df_by_layer = df_merged_beto %>%
  group_by(Layer) %>%
  summarise(r = cor(mean_relatedness, Distance, method = "pearson"),
            r2 = r ** 2,
            rho = cor(mean_relatedness, Distance, method = "spearman"),
            count = n())

### Distribution of correlations across layers
summary(df_by_layer$rho)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.58780 -0.57770 -0.56818 -0.48504 -0.47098  0.02152
summary(df_by_layer$r2)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000542 0.220868 0.305393 0.254792 0.321244 0.332196
summary(df_by_layer$r)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.57636 -0.56678 -0.55262 -0.47774 -0.46997  0.02328
### Pearson's r across BETO layers
df_by_layer %>%
  ggplot(aes(x = Layer,
             y = r)) +
  geom_line(size = 2,
            color = "cornflower blue") +
  theme_minimal() +
  labs(x = "Layer (BETO)",
       y = "Pearson's r") +
  scale_x_continuous(breaks = c(0:13)) +
  theme(text = element_text(size = 15),
        legend.position="none")

### Spearman's rho across BETO layers
df_by_layer %>%
  ggplot(aes(x = Layer,
             y = rho)) +
  geom_line(size = 2,
            color = "cornflower blue") +
  theme_minimal() +
  labs(x = "Layer (BETO)",
       y = "Spearman's rho") +
  scale_x_continuous(breaks = c(0:13)) +
  theme(text = element_text(size = 15),
        legend.position="none")

### R2 across BETO layers
df_by_layer %>%
  ggplot(aes(x = Layer,
             y = r2)) +
  geom_line(size = 2,
            color = "cornflower blue") +
  theme_minimal() +
  labs(x = "Layer (BETO)",
       y = "R2") +
  scale_x_continuous(breaks = c(0:13)) +
  theme(text = element_text(size = 15),
        legend.position="none")

### Strongest (most negative) rank correlation across layers
min(df_by_layer$rho)
## [1] -0.5878024

RQ5: Expected layer

### Reshape to one column per layer; the rename shifts 0-indexed layer
### numbers up by one so column names (Layer1..LayerN) need no backticks
df_wide <- df_merged_beto %>%
  pivot_wider(
    names_from = Layer,       # This specifies where to get the names of the new columns
    values_from = Distance    # This specifies what values to fill the new columns with
  ) %>%
rename_with(.cols = matches("^[0-9]+$"),  
              .fn = ~ paste0("Layer", as.integer(.) + 1))  


base_formula <- "mean_relatedness ~ "

# Create a vector to hold all layer names that you want to include in the models
layer_names <- colnames(df_wide)[grepl("Layer", colnames(df_wide))]

# Generate the model formulas incrementally
# (formula i predicts relatedness from layers 1..i jointly)
formulas <- sapply(seq_along(layer_names), function(i) {
  paste(base_formula, paste(layer_names[1:i], collapse = " + "))
})

# Initialize a vector to store R-squared values
r_squared <- numeric(length(formulas))

# Loop over the formulas
for (i in seq_along(formulas)) {
  model <- lm(formulas[i], data = df_wide)
  r_squared[i] <- summary(model)$r.squared
}


# "Expected layer": mean layer index weighted by each layer's marginal
# gain in R^2 when added to the cumulative regression
df_results_expected = data.frame(r_squared, layer_names) %>%
  mutate(r2 = r_squared) %>%  # duplicate column, kept for naming consistency
  mutate(r2_delta = c(NA, diff(r2))) %>%  # marginal R^2 gain per layer
  mutate(Layer = as.numeric(gsub("Layer", "", layer_names)) - 1) %>%  # back to 0-indexed
  mutate(weighted_layer = Layer * r2_delta)

expected_layer = sum(df_results_expected$weighted_layer, na.rm = TRUE) / sum(df_results_expected$r2_delta, na.rm = TRUE)
expected_layer
## [1] 2.980695

RQ3: Cosine distance vs. Same/Different

Now, we select the best-performing layer from BETO: layer 7, which had the highest R2 in the layer-wise analysis above.

### Layer 7 had the highest R2 for BETO in the layer-wise analysis above, so
### use its distances as the trial-level predictor. (Renamed from df_beto_l5,
### which misleadingly suggested layer 5.)
df_beto_l7 = df_merged %>%
  filter(Layer == 7) %>%
  select(-Same_sense)  # drop the model file's flag; keep the human-coded one
nrow(df_beto_l7)
## [1] 812
### Attach layer-7 distances to each trial-level human response
df_experimental_with_beto = df_final %>%
  left_join(df_beto_l7)
nrow(df_experimental_with_beto)
## [1] 10639
### Does Same_sense explain variance beyond cosine distance, and vice versa?
### All three models share the same random-effects structure; ML fits for LRTs.
mod_full = lmer(data = df_experimental_with_beto,
                Response ~ Same_sense + Distance +
                  (1 + Same_sense + Distance | Participant) + 
                  (1 | List) + (1 | Word),
                REML = FALSE)

### Drops the Same_sense fixed effect (kept as a random slope)
mod_reduced = lmer(data = df_experimental_with_beto,
                Response ~ Distance + # Same_sense +
                  (1 + Same_sense + Distance | Participant) + 
                  (1 | List) + (1 | Word),
                REML = FALSE)

### Drops the Distance fixed effect (kept as a random slope)
mod_just_same = lmer(data = df_experimental_with_beto,
                Response ~ Same_sense + # Distance
                  (1 + Same_sense + Distance | Participant) + 
                  (1 | List) + (1 | Word),
                REML = FALSE)

### Full-model estimates: both predictors contribute
summary(mod_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: Response ~ Same_sense + Distance + (1 + Same_sense + Distance |  
##     Participant) + (1 | List) + (1 | Word)
##    Data: df_experimental_with_beto
## 
##      AIC      BIC   logLik deviance df.resid 
##  33795.9  33883.2 -16886.0  33771.9    10627 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0633 -0.6341 -0.1164  0.6223  3.5636 
## 
## Random effects:
##  Groups      Name                 Variance Std.Dev. Corr       
##  Participant (Intercept)          0.23178  0.4814              
##              Same_senseSame Sense 0.19301  0.4393   -0.51      
##              Distance             0.87600  0.9359   -0.69 -0.15
##  Word        (Intercept)          0.28323  0.5322              
##  List        (Intercept)          0.01825  0.1351              
##  Residual                         1.30359  1.1417              
## Number of obs: 10639, groups:  Participant, 131; Word, 102; List, 10
## 
## Fixed effects:
##                       Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)            2.91931    0.09022  74.77499   32.36   <2e-16 ***
## Same_senseSame Sense   1.87369    0.04923 144.30838   38.06   <2e-16 ***
## Distance              -4.26971    0.22414 306.55782  -19.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Sm_sSS
## Sm_snsSmSns -0.389       
## Distance    -0.523  0.300
### LRT: does Same_sense add information over Distance?
anova(mod_full, mod_reduced)
## Data: df_experimental_with_beto
## Models:
## mod_reduced: Response ~ Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## mod_full: Response ~ Same_sense + Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
##             npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## mod_reduced   11 34125 34205 -17052    34103                         
## mod_full      12 33796 33883 -16886    33772 331.55  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### LRT: does Distance add information over Same_sense?
anova(mod_full, mod_just_same)
## Data: df_experimental_with_beto
## Models:
## mod_just_same: Response ~ Same_sense + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## mod_full: Response ~ Same_sense + Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
##               npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## mod_just_same   11 34002 34082 -16990    33980                         
## mod_full        12 33796 33883 -16886    33772 208.27  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Visualize
### Mean relatedness by distance (20 quantile bins), split by condition
df_experimental_with_beto %>%
  mutate(distance_binned = ntile(Distance, 20)) %>%
  group_by(Same_sense, distance_binned) %>%
  summarize(
    mean_relatedness = mean(Response),
    sd_relatedness = sd(Response),
    count = n(),
    se_relatedness = sd_relatedness / sqrt(count),
  ) %>%
  ggplot(aes(x = distance_binned, 
             y = mean_relatedness, 
             color = Same_sense, 
             fill = Same_sense)) +
  geom_line(size = 1.5) +
  geom_ribbon(aes(ymin = mean_relatedness - se_relatedness, 
                  ymax = mean_relatedness + se_relatedness), 
              alpha = 0.8,
              color = NA) +
  labs(x = "BETO Cosine Distance (Binned)",
       y = "Relatedness",
       color = "Same Sense",
       fill = "Same Sense") +
  theme_minimal() +
  scale_fill_manual(values = my_colors)  +
  scale_color_manual(values = my_colors)  +
  theme(text = element_text(size = 15),
        legend.position="bottom")

RQ4: BETO vs. inter-annotator agreement

### First, get group means
### (one row per item; distance is the layer-7 BETO distance, constant within item)
df_list_mean = df_experimental_with_beto %>%
  group_by(List, Word, Tag, Same_sense) %>%
  summarise(mean_relatedness = mean(Response), .groups = "drop",
            count = n(),
            distance = mean(Distance))
nrow(df_list_mean)
## [1] 812
### Get BETO cor
### Absolute Spearman correlation so it is comparable to the (positive)
### participant-vs-group correlations computed below
BETO_COR = abs(cor(df_list_mean$mean_relatedness,
               df_list_mean$distance, method = "spearman"))

### Now, iterate through ppts: correlate each participant's responses with the
### item-level group means.
### NOTE(review): df_list_mean includes each participant's own responses, so
### this is participant-vs-group rather than strictly leave-one-out — confirm
### this matches the "Leave-one-out" label used in the plot below.
ppts = unique(df_final$Participant)

### Preallocate a list and bind once at the end (avoids O(n^2) rbind-in-loop)
results_list = vector("list", length(ppts))

for (i in seq_along(ppts)) {
  ppt = ppts[i]

  # Subset df_critical for the current participant
  individual_data <- df_final %>%
    filter(Participant == ppt) %>%
    select(List, Word, Tag, Response)  # Ensure you're selecting the needed columns

  # Merge individual data with mean data
  merged_data <- inner_join(individual_data, df_list_mean, by = c("List", "Word", "Tag"))

  # Spearman correlation between this participant and the group means
  test = cor.test(merged_data$Response,
                  merged_data$mean_relatedness,
                 method = "spearman")

  df_test = broom::tidy(test)
  df_test$ppt = ppt
  # Assumes each participant saw exactly one list (length-1 result)
  df_test$List = unique(individual_data$List)
  results_list[[i]] = df_test
}

df_r = bind_rows(results_list)

### Visualization
### Histogram of per-participant agreement; dashed line = BETO's correlation
df_r %>%
  ggplot(aes(x = estimate)) +
  geom_histogram(alpha = .6) +
  scale_x_continuous(limits = c(0, 1)) +
  theme_minimal() +
  geom_vline(xintercept = BETO_COR, size = 1.5, 
             linetype = "dashed", alpha = .7) +
  labs(x = "Leave-one-out Correlation") +
  theme(text = element_text(size = 15),
        legend.position="none")

summary(df_r$estimate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3946  0.7480  0.7962  0.7725  0.8308  0.8834
### What proportion of inter-annotator agreement scores are larger?
prop_larger = df_r %>%
  mutate(larger = estimate >= BETO_COR) %>%
  summarise(mean(larger))
prop_larger
## # A tibble: 1 × 1
##   `mean(larger)`
##            <dbl>
## 1          0.947

Calculate R2

### Full model
### Item-level variance explained by condition + distance together
summary(lm(data = df_list_mean,
           mean_relatedness ~ Same_sense + distance))
## 
## Call:
## lm(formula = mean_relatedness ~ Same_sense + distance, data = df_list_mean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.43999 -0.52874 -0.05629  0.42901  2.47200 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.90576    0.07835   37.09   <2e-16 ***
## Same_senseSame Sense  1.86552    0.06724   27.74   <2e-16 ***
## distance             -4.08053    0.36825  -11.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7658 on 809 degrees of freedom
## Multiple R-squared:  0.6578, Adjusted R-squared:  0.6569 
## F-statistic: 777.5 on 2 and 809 DF,  p-value: < 2.2e-16
### Just same sense
### Variance explained by condition alone
summary(lm(data = df_list_mean,
           mean_relatedness ~ Same_sense))
## 
## Call:
## lm(formula = mean_relatedness ~ Same_sense, data = df_list_mean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2490 -0.6487 -0.1114  0.4836  2.6364 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.11357    0.03437   61.49   <2e-16 ***
## Same_senseSame Sense  2.22638    0.06310   35.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8214 on 810 degrees of freedom
## Multiple R-squared:  0.6058, Adjusted R-squared:  0.6054 
## F-statistic:  1245 on 1 and 810 DF,  p-value: < 2.2e-16
### Just cosine distance
### Variance explained by BETO distance alone
summary(lm(data = df_list_mean,
           mean_relatedness ~ distance))
## 
## Call:
## lm(formula = mean_relatedness ~ distance, data = df_list_mean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.93439 -0.81273 -0.03441  0.87319  2.71132 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.29020    0.08432   50.88   <2e-16 ***
## distance    -9.02865    0.44979  -20.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.069 on 810 degrees of freedom
## Multiple R-squared:  0.3322, Adjusted R-squared:  0.3314 
## F-statistic: 402.9 on 1 and 810 DF,  p-value: < 2.2e-16

RQ6: Layer vs. same/different

# Calculate mean and SE of Distance
# (per BETO layer and condition, over all item pairs)
df_summary <- df_beto_distances %>%
  group_by(Layer, Same_sense) %>%
  summarise(
    mean_Distance = mean(Distance),
    sd_Distance = sd(Distance),
    count = n(),
    se_Distance = sd_Distance / sqrt(count),
    .groups = 'drop'  # Drop the automatic grouping by dplyr
  )

# Modify Same_sense
# (recode the logical flag from the model file into plot-ready labels)
df_summary = df_summary %>%
  mutate(Same_sense = case_when(
    Same_sense == TRUE ~ "Same Sense",
    Same_sense == FALSE ~ "Different Sense"
  ))
  
  

### Mean distance by layer and condition, with +/- 2 SE ribbons
df_summary %>%
  ggplot(aes(x = Layer, 
             y = mean_Distance, 
             color = Same_sense, 
             fill = Same_sense)) +
  geom_line(size = 1.5) +
  geom_ribbon(aes(ymin = mean_Distance - 2 * se_Distance, 
                  ymax = mean_Distance + 2 * se_Distance), 
              alpha = 0.8,
              color = NA) +
  labs(x = "Layer (BETO)",
       y = "Mean Distance",
       color = "",
       fill = "") +
  scale_x_continuous(breaks = c(0:13)) +
  theme_minimal() +
  # scale_fill_viridis(option = "mako", discrete=TRUE) +
  # scale_color_viridis(option = "mako", discrete=TRUE) +
  scale_fill_manual(values = my_colors)  +
  scale_color_manual(values = my_colors)  +
  theme(text = element_text(size = 15),
        legend.position="bottom")

### Centered by layer
### Subtracting each layer's grand mean highlights the same/different gap
df_summary %>%
  group_by(Layer) %>%
  mutate(Distance_scaled = mean_Distance - mean(mean_Distance)) %>%
  ggplot(aes(x = Layer, 
             y = Distance_scaled, 
             color = Same_sense, 
             fill = Same_sense)) +
  geom_line(size = 1.5) +
  geom_ribbon(aes(ymin = Distance_scaled - 2 * se_Distance, 
                  ymax = Distance_scaled + 2 * se_Distance), 
              alpha = 0.8,
              color = NA) +
  labs(x = "Layer (BETO)",
       y = "Mean Distance (Centered)",
       color = "Same Sense",
       fill = "Same Sense") +
  scale_x_continuous(breaks = c(0:13)) +
  theme_minimal() +
  # scale_fill_viridis(option = "mako", discrete=TRUE) +
  # scale_color_viridis(option = "mako", discrete=TRUE) +
  scale_fill_manual(values = my_colors)  +
  scale_color_manual(values = my_colors)  +
  theme(text = element_text(size = 15),
        legend.position="bottom")

We also build a logistic regression model for each layer.

### Reshape BETO distances wide (one column per layer, 1-indexed names);
### note this overwrites the df_wide built for RQ5
df_wide <- df_beto_distances %>%
  pivot_wider(
    names_from = Layer,       # This specifies where to get the names of the new columns
    values_from = Distance    # This specifies what values to fill the new columns with
  ) %>%
rename_with(.cols = matches("^[0-9]+$"),  
              .fn = ~ paste0("Layer", as.integer(.) + 1))  


### Predict the logical Same_sense flag from layer distances (binomial glm)
base_formula <- "Same_sense ~ "

# Create a vector to hold all layer names that you want to include in the models
layer_names <- colnames(df_wide)[grepl("Layer", colnames(df_wide))]

# Generate the model formulas incrementally: individual
# (each model uses a single layer as the sole predictor)
formulas_individual <- sapply(seq_along(layer_names), function(i) {
  paste(base_formula, paste(layer_names[i], collapse = " + "))
})

# Generate the model formulas incrementally: altogether
# (model i uses layers 1..i jointly)
formulas_combined <- sapply(seq_along(layer_names), function(i) {
  paste(base_formula, paste(layer_names[1:i], collapse = " + "))
})


aic_individual <- numeric(length(formulas_individual))
aic_together <- numeric(length(formulas_combined))


# Loop over the formulas
for (i in seq_along(formulas_combined)) {
  model <- glm(formulas_combined[i], data = df_wide, family = binomial())
  aic_together[i] <- summary(model)$aic
}

# Loop over the formulas
for (i in seq_along(formulas_individual)) {
  model <- glm(formulas_individual[i], data = df_wide, family = binomial())
  aic_individual[i] <- summary(model)$aic
}

# Expected layer weighted by each layer's marginal AIC improvement
# (negated diff so that a drop in AIC counts as a positive gain);
# note this overwrites df_results_expected from RQ5
df_results_expected = data.frame(aic_individual, aic_together, layer_names) %>%
  mutate(aic_delta = -c(NA, diff(aic_together))) %>%
  mutate(Layer = as.numeric(gsub("Layer", "", layer_names)) - 1) %>%
  mutate(weighted_layer = Layer * aic_delta)

expected_layer = sum(df_results_expected$weighted_layer, na.rm = TRUE) / sum(df_results_expected$aic_delta, na.rm = TRUE)
expected_layer
## [1] 3.208745
### Best single layer by individual-model AIC
df_results_expected %>%
  filter(aic_individual == min(df_results_expected$aic_individual))
##   aic_individual aic_together layer_names aic_delta Layer weighted_layer
## 1       729.0844     720.9658      Layer6  4.348899     5        21.7445

Additional visualizations

### Item-level descriptives, keeping sentence/sense/gender metadata as
### grouping keys (constant within item) so they survive summarise
df_item_means = df_final %>%
  group_by(List, Word, Same_sense, Sentence_1, Sentence_2,
           Sense_id_s1, Sense_id_s2, Gender_s1, Gender_s2) %>%
  summarise(mean_relatedness = mean(Response),
            sd_relatedness = sd(Response),
            median_relatedness = median(Response),
            count = n())


### Histogram of item means by condition
df_item_means %>%
  ggplot(aes(x = mean_relatedness)) +
  geom_histogram(alpha = .6, bins = 10) +
  theme_minimal() +
  labs(x = "Mean Relatedness",
       y = "Count") +
  facet_wrap(~Same_sense) +
  theme(text = element_text(size = 15),
        legend.position="none")

### Ridge-density version of the same distributions
df_item_means %>%
  ggplot(aes(x = mean_relatedness,
             y = Same_sense,
             fill = Same_sense)) +
  # NOTE(review): ..density.. is superseded by after_stat(density) in
  # ggplot2 >= 3.4 — confirm installed version before updating
  geom_density_ridges2(aes(height = ..density..), 
                       color=gray(0.25), 
                       alpha = .7, 
                       scale=.85, 
                       # size=1, 
                       size = 0,
                       stat="density") +
  labs(x = "Mean Relatedness",
       y = "",
       fill = "") +
  theme_minimal()+
  scale_fill_manual(values = my_colors)  +
  theme(text = element_text(size = 15),
        legend.position="none") 

Comparing all models together

### All model distance files share one naming scheme, so load them in a single
### table-driven pass instead of twelve copy-pasted read_csv + mutate chains.
### Each spec: (file suffix, display name, mono/multilingual status).
### Order matches the original bind order of the per-model data frames.
model_specs = list(
  c("bert-base-spanish-wwm-cased",     "BETO-cased",        "Monolingual"),
  c("xlm-roberta-base",                "XLM-RoBERTa",       "Multilingual"),
  c("distilbert-base-spanish-uncased", "DistilBETO",        "Monolingual"),
  c("albert-base-spanish",             "ALBERT-base",       "Monolingual"),
  c("albert-tiny-spanish",             "ALBERT-tiny",       "Monolingual"),
  c("bert-base-multilingual-cased",    "Multilingual BERT", "Multilingual"),
  c("albert-xlarge-spanish",           "ALBERT-xlarge",     "Monolingual"),
  c("albert-large-spanish",            "ALBERT-large",      "Monolingual"),
  c("albert-xxlarge-spanish",          "ALBERT-xxlarge",    "Monolingual"),
  c("roberta-base-bne",                "RoBERTa-base",      "Monolingual"),
  c("roberta-large-bne",               "RoBERTa-large",     "Monolingual"),
  c("bert-base-spanish-wwm-uncased",   "BETO-uncased",      "Monolingual")
)

### Read each file and tag its rows with the model name and multilingual status
df_all = bind_rows(lapply(model_specs, function(spec) {
  read_csv(paste0("../../data/processed/models/sawc-distances_model-",
                  spec[1], ".csv")) %>%
    mutate(Model = spec[2],
           Multilingual = spec[3])
}))

nrow(df_all)
## [1] 144536
### basic items to get Tag
df_sawc_items = read_csv("../../data/raw/items/sawc_sentence_pairs.csv")

### merge together
### (note this overwrites the BETO-only df_merged used earlier)
df_merged = df_all %>%
  left_join(df_sawc_items)

### Human item means, as in RQ2
df_list_mean = df_final %>%
  group_by(List, Word, Tag) %>%
  summarise(mean_relatedness = mean(Response), .groups = "drop",
            count = n())
nrow(df_list_mean)
## [1] 812
df_merged = df_merged %>%
  inner_join(df_list_mean)
nrow(df_merged)
## [1] 144536

Distance by layer, by model

### Mean distance per model x layer x condition (Same_sense here is the
### logical flag carried over from the model files)
df_distance_by_layer = df_merged %>%
  group_by(Model, Multilingual, Same_sense, Layer, n_params) %>%
  summarise(mean_distance = mean(Distance),
            .groups = 'drop'  # Drop the automatic grouping by dplyr
  )

nrow(df_distance_by_layer)
## [1] 356
### One row per model x layer, with the different-minus-same distance gap
### (larger Diff = layer separates senses more strongly)
df_distance_by_layer_wide = df_distance_by_layer %>%
  pivot_wider(names_from = "Same_sense",
              values_from = "mean_distance",
              names_prefix = "Distance_") %>%
  mutate(Diff = Distance_FALSE - Distance_TRUE)


### Map each model to its architecture family by name, rather than by the
### positional order of unique(); the positional version silently mislabels
### models if the row ordering ever changes.
df_families = data.frame(
  Model = unique(df_distance_by_layer_wide$Model)
) %>%
  mutate(Family = case_when(
    grepl("ALBERT", Model) ~ "ALBERT",
    grepl("RoBERTa", Model) ~ "RoBERTa",  # includes XLM-RoBERTa
    TRUE ~ "BERT"  # BETO-cased/uncased, DistilBETO, Multilingual BERT
  ))

### Express layer as a proportion of model depth, then attach family labels
df_distance_by_layer_wide = df_distance_by_layer_wide %>%
  group_by(Model) %>%
  mutate(max_layer = max(Layer),
         prop_layer = Layer / max_layer)  %>%
  inner_join(df_families)


### Sense-separation gap by proportional depth: thin lines = individual
### models, thick black line = mean across models
df_distance_by_layer_wide %>%
  ggplot(aes(x = prop_layer,
             y = Diff)) +
  geom_line(size = .5, aes(color = Model, group = Model),
            alpha = .3) + 
  stat_summary(
    fun = mean,                    # calculate mean
    geom = "line",                 # use line geometry
    size = 2,                    # set line size
    na.rm = TRUE                   # remove NA values
  ) +
  theme_minimal() +
  labs(x = "Proportion of Total Layers",
       y = "Mean Difference (Different - Same Sense)",
       color = "") +  # Renaming legend title appropriately
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(text = element_text(size = 15),
        legend.position = "none") 

### Same gap, binned into depth deciles with mean +/- SE ribbons per family
df_distance_by_layer_wide %>%
  mutate(binned_prop_layer = ntile(prop_layer, 10)) %>%
  mutate(prop_binned = binned_prop_layer / 10) %>%
  ggplot(aes(x = prop_binned,
             y = Diff)) +
  stat_summary(
    aes(group = Family,
        color = Family),  
    fun = mean,    
    geom = "line",        
    size = 2              
  ) +
  stat_summary(
    aes(group = Family, 
        fill = Family), 
    fun.data = mean_se,    
    geom = "ribbon",  
    alpha = 0.2,   
    color = NA     
  ) +
  theme_minimal() +
  labs(x = "Layer Depth Ratio",
       y = "Mean Cosine Distance Difference (Different - Same Sense)",
       color = "Family") +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(text = element_text(size = 15),
        legend.position = "bottom") 

R2 by layer, by model

### Human-model correlation per model and layer (negative r expected;
### overwrites the BETO-only df_by_layer from RQ2)
df_by_layer = df_merged %>%
  group_by(Model, Multilingual, Layer, n_params) %>%
  summarise(r = cor(mean_relatedness, Distance, method = "pearson"),
            r2 = r ** 2,
            rho = cor(mean_relatedness, Distance, method = "spearman"),
            count = n())

### Best model-layer combination overall
max(df_by_layer$r2)
## [1] 0.3321961
df_by_layer %>%
  filter(r2 == max(df_by_layer$r2))
## # A tibble: 1 × 8
## # Groups:   Model, Multilingual, Layer [1]
##   Model      Multilingual Layer  n_params      r    r2    rho count
##   <chr>      <chr>        <dbl>     <dbl>  <dbl> <dbl>  <dbl> <int>
## 1 BETO-cased Monolingual      7 109850880 -0.576 0.332 -0.587   812
### R2 by layer for the base-size models (deeper/distilled models excluded);
### linetype doubles the color encoding to distinguish overlapping lines
df_by_layer %>%
  filter(Model != "ALBERT-tiny") %>%
  filter(Model != "ALBERT-xlarge") %>%
  filter(Model != "DistilBETO") %>%
  filter(Model != "ALBERT-large") %>%
  filter(Model != "RoBERTa-large") %>%
  ggplot(aes(x = Layer,
             y = r2,
             color = Model,
             linetype = Model)) +
  geom_line(size = 2) +
  theme_minimal() +
  labs(x = "Layer",
       y = "R2",
       color = "",
       linetype = "") +
  scale_x_continuous(breaks = c(0:13)) +
  scale_color_viridis(option = "mako", discrete=TRUE) +
  theme(text = element_text(size = 12),
        legend.position="bottom")

### Same plot as above, color-only (no linetype aesthetic)
df_by_layer %>%
  filter(Model != "ALBERT-tiny") %>%
  filter(Model != "ALBERT-xlarge") %>%
  filter(Model != "DistilBETO") %>%
  filter(Model != "ALBERT-large") %>%
  filter(Model != "RoBERTa-large") %>%
  ggplot(aes(x = Layer,
             y = r2,
             color = Model)) +
  geom_line(size = 2) +
  theme_minimal() +
  labs(x = "Layer",
       y = "R2",
       color = "") +
  scale_x_continuous(breaks = c(0:13)) +
  scale_color_viridis(option = "mako", discrete=TRUE) +
  theme(text = element_text(size = 12),
        legend.position="bottom")

### The larger models, plotted separately (more layers than the base models)
df_by_layer %>%
  filter(Model %in% c("ALBERT-xlarge", 
                      "ALBERT-large",
                      "RoBERTa-large")) %>%
  ggplot(aes(x = Layer,
             y = r2,
             color = Model)) +
  geom_line(size = 2) +
  theme_minimal() +
  labs(x = "Layer",
       y = "R2",
       color = "") +
  scale_color_viridis(option = "mako", discrete=TRUE) +
  theme(text = element_text(size = 15),
        legend.position="bottom")

We can also break this down by layer depth, expressed as a proportion of each model's total number of layers, which makes models of different depths comparable.

### Map models to architecture families by name rather than by the positional
### order of unique(), which silently mislabels models if the ordering changes.
df_families = data.frame(
  Model = unique(df_by_layer$Model)
) %>%
  mutate(Family = case_when(
    grepl("ALBERT", Model) ~ "ALBERT",
    grepl("RoBERTa", Model) ~ "RoBERTa",  # includes XLM-RoBERTa
    TRUE ~ "BERT"  # BETO-cased/uncased, DistilBETO, Multilingual BERT
  ))

### Normalize layer index by model depth and attach family labels
df_by_layer = df_by_layer %>%
  group_by(Model) %>%
  mutate(max_layer = max(Layer),
         prop_layer = Layer / max_layer)  %>%
  inner_join(df_families)


### R2 by proportional depth: thin lines = individual models,
### thick black line = mean across models
df_by_layer %>%
  ggplot(aes(x = prop_layer,
             y = r2)) +
  geom_line(size = .5, aes(color = Model, group = Model),
            alpha = .3) + 
  stat_summary(
    fun = mean,                    # calculate mean
    geom = "line",                 # use line geometry
    size = 2,                    # set line size
    na.rm = TRUE                   # remove NA values
  ) +
  theme_minimal() +
  labs(x = "Proportion of Total Layers",
       y = "R2",
       color = "") +  # Renaming legend title appropriately
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(text = element_text(size = 15),
        legend.position = "none") 

### Smoothed (default loess) R2 trajectory per architecture family
df_by_layer %>%
  ggplot(aes(x = prop_layer,
             y = r2,
             color = Family)) + 
  geom_smooth(size = 2) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme_minimal() +
  labs(x = "Proportion of Total Layers",
       y = "R2",
       color = "Family") +
  theme(text = element_text(size = 15),
        legend.position = "bottom") 

### R2 binned into depth deciles, with mean +/- SE ribbons per family
df_by_layer %>%
  mutate(binned_prop_layer = ntile(prop_layer, 10)) %>%
  mutate(prop_binned = binned_prop_layer / 10) %>%
  ggplot(aes(x = prop_binned,
             y = r2)) +
  stat_summary(
    aes(group = Family,
        color = Family),  
    fun = mean,    
    geom = "line",        
    size = 2              
  ) +
  stat_summary(
    aes(group = Family, 
        fill = Family), 
    fun.data = mean_se,    
    geom = "ribbon",  
    alpha = 0.2,   
    color = NA     
  ) +
  theme_minimal() +
  labs(x = "Layer Depth Ratio",
       y = "R2",
       color = "Family") +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(text = element_text(size = 15),
        legend.position = "bottom") 

Comparison by model parameters

The BETO/ALBERT/DistilBETO family of models provides a nice within-family comparison of the effect of the number of parameters, since these models are trained on the same datasets.

### Peak R2 per model (its best layer), carrying along parameter count
### and multilingual status for the scatter plot below.
df_max_r2 <- df_by_layer %>%
  group_by(Model, n_params, Multilingual) %>%
  summarise(max_r2 = max(r2))

### Best-layer R2 vs. model size (log-scaled parameter count), with a
### dotted red reference line for human accuracy.
df_max_r2 %>%
  ggplot(aes(x = n_params, y = max_r2,
             color = Multilingual, shape = Multilingual)) +
  geom_point(size = 6, alpha = .7) +
  # Human accuracy reference line; df_r$estimate is squared to put it on
  # the R2 scale -- NOTE(review): confirm df_r holds the human correlations.
  geom_hline(yintercept = mean(df_r$estimate)^2,
             linetype = "dotted", color = "red",
             size = 1.2, alpha = .5) +
  geom_text_repel(aes(label = Model), size = 3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  scale_color_manual(values = my_colors) +
  labs(x = "Parameters",
       y = "Maximum R2",
       color = "",
       shape = "") +
  theme_minimal() +
  theme(text = element_text(size = 15),
        legend.position = "bottom")

Same vs. Different Sense residuals

# For each LLM, regress human relatedness on Distance x Layer and attach
# that model's residuals to its own rows.
models <- by(df_merged, df_merged$Model, function(d) {
  fit <- lm(mean_relatedness ~ Distance * Layer, data = d)
  d$residuals <- residuals(fit)
  d
})

# Stack the per-model pieces back into one dataframe.
results <- do.call(rbind, models)

# Modify Same_sense
# Recode the logical Same_sense column into display labels.
results <- results %>%
  mutate(Same_sense = if_else(Same_sense, "Same Sense", "Different Sense"))
  



### Residual distributions by sense condition, one facet per model.
results %>%
  ggplot(aes(x = residuals, fill = Same_sense)) +
  geom_density(alpha = .7, size = 0) +
  # Zero line: where residuals would sit if Distance x Layer explained
  # relatedness equally well in both conditions.
  geom_vline(xintercept = 0, size = .6, linetype = "dashed") +
  scale_fill_manual(values = my_colors) +
  labs(x = "Residuals (Relatedness ~ Distance * Layer)",
       fill = "") +
  facet_wrap(~Model) +
  theme_minimal() +
  theme(text = element_text(size = 15),
        legend.position = "bottom")

Comparison by demographic

### Load Chilean BERT distances, tag with model metadata, and append to
### the pooled model table.
df_chilean_bert <- read_csv("../../data/processed/models/sawc-distances_model-patana-chilean-spanish-bert.csv") %>%
  mutate(Model = "Chilean-BERT",
         Multilingual = "Monolingual")

df_all <- bind_rows(df_all, df_chilean_bert)
nrow(df_all)
## [1] 155092
### Load relatedness averaged within each demographic group
### (Chile / México / España); drop Same_sense to avoid join conflicts.
df_demographic_averages <- read_csv("../../data/processed/human/sawc_relatedness_by_demographic.csv") %>%
  select(-Same_sense)

### Join demographic means with model distances.
### NOTE(review): layer is hard-coded to 4 (the original flagged this as
### "just test"); the stated goal is the best layer per model -- confirm
### before drawing conclusions from this section.
df_llm_plus_demo <- df_demographic_averages %>%
  inner_join(df_all) %>%
  select(Sentence_1, Sentence_2, Word,
         mean_relatedness_Chile, mean_relatedness_México, mean_relatedness_España,
         Model, Distance, Layer) %>%
  filter(Layer == 4)

df_llm_plus_demo
## # A tibble: 10,556 × 9
##    Sentence_1          Sente…¹ Word  mean_…² mean_…³ mean_…⁴ Model Dista…⁵ Layer
##    <chr>               <chr>   <chr>   <dbl>   <dbl>   <dbl> <chr>   <dbl> <dbl>
##  1 Compró el aceite d… Compró… acei…     4.5    3.18      NA BETO… 0.0729      4
##  2 Compró el aceite d… Compró… acei…     4.5    3.18      NA XLM-… 0.0110      4
##  3 Compró el aceite d… Compró… acei…     4.5    3.18      NA Dist… 0.0369      4
##  4 Compró el aceite d… Compró… acei…     4.5    3.18      NA ALBE… 0.0211      4
##  5 Compró el aceite d… Compró… acei…     4.5    3.18      NA ALBE… 0.0277      4
##  6 Compró el aceite d… Compró… acei…     4.5    3.18      NA Mult… 0.0209      4
##  7 Compró el aceite d… Compró… acei…     4.5    3.18      NA ALBE… 0.00694     4
##  8 Compró el aceite d… Compró… acei…     4.5    3.18      NA ALBE… 0.0201      4
##  9 Compró el aceite d… Compró… acei…     4.5    3.18      NA ALBE… 0.0344      4
## 10 Compró el aceite d… Compró… acei…     4.5    3.18      NA RoBE… 0.0392      4
## # … with 10,546 more rows, and abbreviated variable names ¹​Sentence_2,
## #   ²​mean_relatedness_Chile, ³​mean_relatedness_México,
## #   ⁴​mean_relatedness_España, ⁵​Distance
## # ℹ Use `print(n = ...)` to see more rows
### One row per sentence pair: spread each model's distance into its own
### column so we can correlate models against demographic means.
s <- df_llm_plus_demo %>%
  pivot_wider(names_from = Model, values_from = Distance)

### Keep only the demographic means (renamed to short labels) plus one
### distance column per model; select() renames in place, so the extra
### mutate step is unnecessary.
relevant_columns <- s %>%
  select(Chile = mean_relatedness_Chile,
         México = mean_relatedness_México,
         España = mean_relatedness_España,
         `BETO-cased`, `XLM-RoBERTa`,
         DistilBETO, `ALBERT-base`, `ALBERT-tiny`,
         `Multilingual BERT`, `ALBERT-xlarge`, `ALBERT-large`,
         `ALBERT-xxlarge`, `RoBERTa-base`, `RoBERTa-large`, `BETO-uncased`,
         `Chilean-BERT`)

### Absolute value of all pairwise correlations (complete cases only),
### so correlation strength is comparable regardless of sign.
relevant_cors <- abs(cor(relevant_columns, use = "complete.obs"))
relevant_cors
##                       Chile    México    España BETO-cased XLM-RoBERTa
## Chile             1.0000000 0.8508143 0.8238417  0.4994681   0.2236498
## México            0.8508143 1.0000000 0.8747620  0.5482607   0.2776124
## España            0.8238417 0.8747620 1.0000000  0.5512521   0.2524410
## BETO-cased        0.4994681 0.5482607 0.5512521  1.0000000   0.4109759
## XLM-RoBERTa       0.2236498 0.2776124 0.2524410  0.4109759   1.0000000
## DistilBETO        0.3745862 0.4047428 0.3953257  0.6350121   0.3220583
## ALBERT-base       0.3593834 0.3871968 0.3666261  0.5892588   0.4230497
## ALBERT-tiny       0.3063879 0.3365394 0.2986687  0.4209135   0.3284829
## Multilingual BERT 0.3110919 0.3362316 0.3319716  0.6122717   0.4232802
## ALBERT-xlarge     0.3404443 0.3609759 0.3480030  0.6055391   0.3848763
## ALBERT-large      0.2542477 0.2860133 0.2554225  0.5353650   0.3843066
## ALBERT-xxlarge    0.2807636 0.3015163 0.2961813  0.5704506   0.3846120
## RoBERTa-base      0.4147787 0.4424037 0.4519473  0.7277146   0.3940679
## RoBERTa-large     0.3482683 0.3698964 0.3737819  0.6685210   0.4107094
## BETO-uncased      0.3049111 0.3410787 0.3617072  0.6036837   0.3451527
## Chilean-BERT      0.4314856 0.4718126 0.4595117  0.8521254   0.4434771
##                   DistilBETO ALBERT-base ALBERT-tiny Multilingual BERT
## Chile              0.3745862   0.3593834   0.3063879         0.3110919
## México             0.4047428   0.3871968   0.3365394         0.3362316
## España             0.3953257   0.3666261   0.2986687         0.3319716
## BETO-cased         0.6350121   0.5892588   0.4209135         0.6122717
## XLM-RoBERTa        0.3220583   0.4230497   0.3284829         0.4232802
## DistilBETO         1.0000000   0.5504276   0.5686367         0.5004061
## ALBERT-base        0.5504276   1.0000000   0.5175161         0.6514023
## ALBERT-tiny        0.5686367   0.5175161   1.0000000         0.3949596
## Multilingual BERT  0.5004061   0.6514023   0.3949596         1.0000000
## ALBERT-xlarge      0.6089505   0.7189638   0.6704144         0.5878105
## ALBERT-large       0.5962305   0.7837651   0.6559533         0.6192542
## ALBERT-xxlarge     0.5839987   0.5835854   0.6802126         0.5352473
## RoBERTa-base       0.6180596   0.5649007   0.4709227         0.5408775
## RoBERTa-large      0.6102204   0.4749476   0.4949663         0.5322389
## BETO-uncased       0.8248177   0.4594634   0.4341077         0.4176955
## Chilean-BERT       0.6471521   0.5941260   0.4867991         0.5682728
##                   ALBERT-xlarge ALBERT-large ALBERT-xxlarge RoBERTa-base
## Chile                 0.3404443    0.2542477      0.2807636    0.4147787
## México                0.3609759    0.2860133      0.3015163    0.4424037
## España                0.3480030    0.2554225      0.2961813    0.4519473
## BETO-cased            0.6055391    0.5353650      0.5704506    0.7277146
## XLM-RoBERTa           0.3848763    0.3843066      0.3846120    0.3940679
## DistilBETO            0.6089505    0.5962305      0.5839987    0.6180596
## ALBERT-base           0.7189638    0.7837651      0.5835854    0.5649007
## ALBERT-tiny           0.6704144    0.6559533      0.6802126    0.4709227
## Multilingual BERT     0.5878105    0.6192542      0.5352473    0.5408775
## ALBERT-xlarge         1.0000000    0.7917851      0.7461274    0.5477237
## ALBERT-large          0.7917851    1.0000000      0.7118564    0.5302841
## ALBERT-xxlarge        0.7461274    0.7118564      1.0000000    0.5306079
## RoBERTa-base          0.5477237    0.5302841      0.5306079    1.0000000
## RoBERTa-large         0.5774232    0.5414029      0.5628845    0.7685964
## BETO-uncased          0.4750966    0.4947804      0.5216844    0.5498036
## Chilean-BERT          0.5725059    0.5746295      0.5734939    0.7260469
##                   RoBERTa-large BETO-uncased Chilean-BERT
## Chile                 0.3482683    0.3049111    0.4314856
## México                0.3698964    0.3410787    0.4718126
## España                0.3737819    0.3617072    0.4595117
## BETO-cased            0.6685210    0.6036837    0.8521254
## XLM-RoBERTa           0.4107094    0.3451527    0.4434771
## DistilBETO            0.6102204    0.8248177    0.6471521
## ALBERT-base           0.4749476    0.4594634    0.5941260
## ALBERT-tiny           0.4949663    0.4341077    0.4867991
## Multilingual BERT     0.5322389    0.4176955    0.5682728
## ALBERT-xlarge         0.5774232    0.4750966    0.5725059
## ALBERT-large          0.5414029    0.4947804    0.5746295
## ALBERT-xxlarge        0.5628845    0.5216844    0.5734939
## RoBERTa-base          0.7685964    0.5498036    0.7260469
## RoBERTa-large         1.0000000    0.5811935    0.6643691
## BETO-uncased          0.5811935    1.0000000    0.5858514
## Chilean-BERT          0.6643691    0.5858514    1.0000000
### Shaded correlation matrix with black text labels.
corrplot(relevant_cors,
         method = "shade",
         tl.col = "black")

# 101 break points spanning [-1, 1] so the 100-color palette is centered
# on zero (one color per interval).
breaks <- seq(-1, 1, length.out=101) 
# Diverging palette: blue (negative) -> white (zero) -> coral (positive).
my_palette <- colorRampPalette(c("lightblue", "white", "lightcoral"))(100)

# Heatmap of the |correlation| matrix. Rows are clustered (dendrogram
# shown) but columns keep input order (Colv=FALSE), so despite symm=TRUE
# the display is not row/column symmetric -- NOTE(review): confirm this
# ordering mismatch is intentional.
heatmap.2(relevant_cors,
          breaks=breaks,
          col=my_palette,
          main="Correlations",
          trace="none",        # no trace lines drawn inside cells
          dendrogram="row",    # draw the row dendrogram only
          Rowv=TRUE, 
          Colv=FALSE,
          scale="none",        # plot raw values; no row/column standardizing
          key=TRUE,
          keysize=1.2,
          symm=TRUE, # symmetric heatmap
          density.info="none",
          margins=c(6,6),
          cexRow=0.9, # Adjust font size of row labels if needed
          cexCol=0.9, # Adjust font size of column labels if needed
          # cellnote=round(relevant_cors, 2), # Add correlation values to cells
          notecol="black", # Color for cell values (used only if cellnote set)
          notecex=0.8) # Font size for cell values (used only if cellnote set)

Appendix

Analysis of GPT-4 Ratings

### Human average relatedness and GPT-4 Turbo ratings for the same pairs,
### merged on their shared key columns.
df_sawc <- read_csv("../../data/processed/human/sawc_avg_relatedness.csv")
df_gpt4 <- read_csv("../../data/processed/models/sawc-ratings-gpt4-turbo.csv")

df_merged_gpt4 <- inner_join(df_gpt4, df_sawc)
nrow(df_merged_gpt4)
## [1] 812
### Rank (Spearman) correlation between GPT-4 ratings and human means.
cor.test(df_merged_gpt4$GPT_rating,
         df_merged_gpt4$mean_relatedness,
         method = "spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged_gpt4$GPT_rating and df_merged_gpt4$mean_relatedness
## S = 17918116, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7991942
### Linear (Pearson) correlation between GPT-4 ratings and human means.
cor.test(df_merged_gpt4$GPT_rating,
         df_merged_gpt4$mean_relatedness,
         method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged_gpt4$GPT_rating and df_merged_gpt4$mean_relatedness
## t = 40.53, df = 810, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7943084 0.8398946
## sample estimates:
##       cor 
## 0.8183849
### Both predictors together: does each explain unique variance in the
### human relatedness means?
mod_full <- lm(mean_relatedness ~ Same_sense + GPT_rating,
               data = df_merged_gpt4)
summary(mod_full)
## 
## Call:
## lm(formula = mean_relatedness ~ Same_sense + GPT_rating, data = df_merged_gpt4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.33981 -0.45438 -0.04765  0.40039  1.92798 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.11995    0.05412   20.69   <2e-16 ***
## Same_senseSame Sense  1.13595    0.07189   15.80   <2e-16 ***
## GPT_rating            0.45208    0.02120   21.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6577 on 809 degrees of freedom
## Multiple R-squared:  0.7476, Adjusted R-squared:  0.747 
## F-statistic:  1198 on 2 and 809 DF,  p-value: < 2.2e-16
### GPT-4 rating alone as a predictor of human relatedness.
mod_gpt <- lm(mean_relatedness ~ GPT_rating,
              data = df_merged_gpt4)
summary(mod_gpt)
## 
## Call:
## lm(formula = mean_relatedness ~ GPT_rating, data = df_merged_gpt4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.88145 -0.43840  0.01484  0.49944  2.62128 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.76252    0.05621   13.56   <2e-16 ***
## GPT_rating   0.69045    0.01704   40.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7519 on 810 degrees of freedom
## Multiple R-squared:  0.6698, Adjusted R-squared:  0.6693 
## F-statistic:  1643 on 1 and 810 DF,  p-value: < 2.2e-16
### Same/different sense alone as a predictor of human relatedness.
mod_sense <- lm(mean_relatedness ~ Same_sense,
                data = df_merged_gpt4)
summary(mod_sense)
## 
## Call:
## lm(formula = mean_relatedness ~ Same_sense, data = df_merged_gpt4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2490 -0.6487 -0.1114  0.4836  2.6364 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.11357    0.03437   61.49   <2e-16 ***
## Same_senseSame Sense  2.22638    0.06310   35.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8214 on 810 degrees of freedom
## Multiple R-squared:  0.6058, Adjusted R-squared:  0.6054 
## F-statistic:  1245 on 1 and 810 DF,  p-value: < 2.2e-16
### Residuals from the GPT-rating-only model, plotted by sense condition.
df_merged_gpt4$residuals <- residuals(mod_gpt)

df_merged_gpt4 %>%
  ggplot(aes(x = residuals, fill = Same_sense)) +
  geom_density(alpha = .7, size = 0) +
  # Zero reference: residuals centered here mean GPT-4 ratings alone
  # account for that condition's relatedness.
  geom_vline(xintercept = 0, size = .6, linetype = "dashed") +
  scale_fill_manual(values = my_colors) +
  labs(x = "Residuals (GPT-4 Turbo Ratings)",
       fill = "") +
  theme_minimal() +
  theme(text = element_text(size = 15),
        legend.position = "bottom")